{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 读取数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "import jieba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "df_train = pd.read_csv('./data/nCoV_100k_train.labled.csv',engine ='python')\n",
    "df_test = pd.read_csv('./data/nCov_10k_test.csv',engine ='python')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "               微博id        微博发布时间      发布人账号  \\\n",
      "0  4456072029125500  01月01日 23:50     存曦1988   \n",
      "1  4456074167480980  01月01日 23:58   LunaKrys   \n",
      "2  4456054253264520  01月01日 22:39  小王爷学辩论o_O   \n",
      "3  4456061509126470  01月01日 23:08         芩鎟   \n",
      "4  4455979322528190  01月01日 17:42   changlwj   \n",
      "\n",
      "                                              微博中文内容  \\\n",
      "0  写在年末冬初孩子流感的第五天，我们仍然没有忘记热情拥抱这2020年的第一天。带着一丝迷信，早...   \n",
      "1    开年大模型…累到以为自己发烧了腰疼膝盖疼腿疼胳膊疼脖子疼#Luna的Krystallife#?   \n",
      "2  �偳癯空饩褪俏业�，爹，发烧快好，毕竟美好的假期拿来养病不太好，假期还是要好好享受快乐，爹，...   \n",
      "3                     新年的第一天感冒又发烧的也太衰了但是我要想着明天一定会好的?   \n",
      "4  问：我们意念里有坏的想法了，天神就会给记下来，那如果有好的想法也会被记下来吗？答：那当然了。...   \n",
      "\n",
      "                                                微博图片 微博视频 情感倾向  \n",
      "0  ['https://ww2.sinaimg.cn/orj360/005VnA1zly1gah...   []    0  \n",
      "1                                                 []   []   -1  \n",
      "2  ['https://ww2.sinaimg.cn/thumb150/006ymYXKgy1g...   []    1  \n",
      "3  ['https://ww2.sinaimg.cn/orj360/005FL9LZgy1gah...   []    1  \n",
      "4                                                 []   []    1  \n",
      "               微博id        微博发布时间       发布人账号  \\\n",
      "0  4456068992182160  01月01日 23:38  -精緻的豬豬女戰士-   \n",
      "1  4456424178427250  01月02日 23:09  liujunyi88   \n",
      "2  4456797466940200  01月03日 23:53       ablsa   \n",
      "3  4456791021108920  01月03日 23:27    喵吃鱼干Lynn   \n",
      "4  4457086404997440  01月04日 19:01   我的发小今年必脱单   \n",
      "\n",
      "                                              微博中文内容  \\\n",
      "0  #你好2020#新年第一天元气满满的早起出门买早饭结果高估了自己抗冻能力回家成功冻发烧（大概...   \n",
      "1  大宝又感冒鼻塞咳嗽了，还有发烧。队友加班几天不回。感觉自己的情绪在家已然是随时引爆的状态。情...   \n",
      "2                      还要去输两天液，这天也太容易感冒发烧了，一定要多喝热水啊?   \n",
      "3                            我太难了别人怎么发烧都没事就我一检查甲型流感?   \n",
      "4  果然是要病一场的喽回来第三天开始感冒今儿还发烧了喉咙眼睛都难受的一匹怎么样能不经意让我的毕设...   \n",
      "\n",
      "                                                微博图片 微博视频  \n",
      "0  ['https://ww2.sinaimg.cn/thumb150/745aa591ly1g...   []  \n",
      "1                                                 []   []  \n",
      "2  ['https://ww3.sinaimg.cn/orj360/006fTidCly1gaj...   []  \n",
      "3                                                 []   []  \n",
      "4                                                 []   []  \n"
     ]
    }
   ],
   "source": [
    "print(df_train.head())\n",
    "print(df_test.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "def cut_words(sentence):\n",
    "    #print sentence\n",
    "    return \" \".join(jieba.cut(sentence))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "MAX_SEQUENCE_LENGTH = 100\n",
    "input_categories = '微博中文内容'\n",
    "output_categories = '情感倾向'\n",
    "df_train = df_train[df_train[output_categories].isin(['-1','0','1'])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\37473\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 0.692 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "df_train['sentence_cuted'] =  df_train[input_categories].astype(str).apply(cut_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "df_test['sentence_cuted'] =  df_test[input_categories].astype(str).apply(cut_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>微博id</th>\n",
       "      <th>微博发布时间</th>\n",
       "      <th>发布人账号</th>\n",
       "      <th>微博中文内容</th>\n",
       "      <th>微博图片</th>\n",
       "      <th>微博视频</th>\n",
       "      <th>sentence_cuted</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4456068992182160</td>\n",
       "      <td>01月01日 23:38</td>\n",
       "      <td>-精緻的豬豬女戰士-</td>\n",
       "      <td>#你好2020#新年第一天元气满满的早起出门买早饭结果高估了自己抗冻能力回家成功冻发烧（大概...</td>\n",
       "      <td>['https://ww2.sinaimg.cn/thumb150/745aa591ly1g...</td>\n",
       "      <td>[]</td>\n",
       "      <td># 你好 2020 # 新年 第一天 元气 满满的 早起 出门 买 早饭 结果 高估 了 自...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4456424178427250</td>\n",
       "      <td>01月02日 23:09</td>\n",
       "      <td>liujunyi88</td>\n",
       "      <td>大宝又感冒鼻塞咳嗽了，还有发烧。队友加班几天不回。感觉自己的情绪在家已然是随时引爆的状态。情...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>大宝 又 感冒 鼻塞 咳嗽 了 ， 还有 发烧 。 队友 加班 几天 不回 。 感觉 自己 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4456797466940200</td>\n",
       "      <td>01月03日 23:53</td>\n",
       "      <td>ablsa</td>\n",
       "      <td>还要去输两天液，这天也太容易感冒发烧了，一定要多喝热水啊?</td>\n",
       "      <td>['https://ww3.sinaimg.cn/orj360/006fTidCly1gaj...</td>\n",
       "      <td>[]</td>\n",
       "      <td>还要 去 输 两天 液 ， 这天 也 太 容易 感冒 发烧 了 ， 一定 要 多 喝 热水 啊 ?</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4456791021108920</td>\n",
       "      <td>01月03日 23:27</td>\n",
       "      <td>喵吃鱼干Lynn</td>\n",
       "      <td>我太难了别人怎么发烧都没事就我一检查甲型流感?</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>我太难 了 别人 怎么 发烧 都 没事 就 我 一 检查 甲型 流感 ?</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4457086404997440</td>\n",
       "      <td>01月04日 19:01</td>\n",
       "      <td>我的发小今年必脱单</td>\n",
       "      <td>果然是要病一场的喽回来第三天开始感冒今儿还发烧了喉咙眼睛都难受的一匹怎么样能不经意让我的毕设...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>果然 是 要 病 一场 的 喽 回来 第三天 开始 感冒 今儿 还 发烧 了 喉咙 眼睛 都...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               微博id        微博发布时间       发布人账号  \\\n",
       "0  4456068992182160  01月01日 23:38  -精緻的豬豬女戰士-   \n",
       "1  4456424178427250  01月02日 23:09  liujunyi88   \n",
       "2  4456797466940200  01月03日 23:53       ablsa   \n",
       "3  4456791021108920  01月03日 23:27    喵吃鱼干Lynn   \n",
       "4  4457086404997440  01月04日 19:01   我的发小今年必脱单   \n",
       "\n",
       "                                              微博中文内容  \\\n",
       "0  #你好2020#新年第一天元气满满的早起出门买早饭结果高估了自己抗冻能力回家成功冻发烧（大概...   \n",
       "1  大宝又感冒鼻塞咳嗽了，还有发烧。队友加班几天不回。感觉自己的情绪在家已然是随时引爆的状态。情...   \n",
       "2                      还要去输两天液，这天也太容易感冒发烧了，一定要多喝热水啊?   \n",
       "3                            我太难了别人怎么发烧都没事就我一检查甲型流感?   \n",
       "4  果然是要病一场的喽回来第三天开始感冒今儿还发烧了喉咙眼睛都难受的一匹怎么样能不经意让我的毕设...   \n",
       "\n",
       "                                                微博图片 微博视频  \\\n",
       "0  ['https://ww2.sinaimg.cn/thumb150/745aa591ly1g...   []   \n",
       "1                                                 []   []   \n",
       "2  ['https://ww3.sinaimg.cn/orj360/006fTidCly1gaj...   []   \n",
       "3                                                 []   []   \n",
       "4                                                 []   []   \n",
       "\n",
       "                                      sentence_cuted  \n",
       "0  # 你好 2020 # 新年 第一天 元气 满满的 早起 出门 买 早饭 结果 高估 了 自...  \n",
       "1  大宝 又 感冒 鼻塞 咳嗽 了 ， 还有 发烧 。 队友 加班 几天 不回 。 感觉 自己 ...  \n",
       "2  还要 去 输 两天 液 ， 这天 也 太 容易 感冒 发烧 了 ， 一定 要 多 喝 热水 啊 ?  \n",
       "3               我太难 了 别人 怎么 发烧 都 没事 就 我 一 检查 甲型 流感 ?  \n",
       "4  果然 是 要 病 一场 的 喽 回来 第三天 开始 感冒 今儿 还 发烧 了 喉咙 眼睛 都...  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "def compute_output_arrays(df, columns):\n",
    "    return np.asarray(df[columns].astype(int) + 1)\n",
    "outputs = compute_output_arrays(df_train, output_categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 0, 2, ..., 1, 2, 1])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "outputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "train = df_train\n",
    "test = df_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=50000, \n",
    "                                                  lower=False,filters=\"\")# 对数据构造词典\n",
    "tokenizer.fit_on_texts(train['sentence_cuted'].tolist()+test['sentence_cuted'].tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "train_ = tokenizer.texts_to_sequences(train['sentence_cuted'].values)# 文本 ---> 数字\n",
    "test_ = tokenizer.texts_to_sequences(test['sentence_cuted'].values)# 同上"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "99913\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "92"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(len(train_))\n",
    "len(train_[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TFIDF构建文本特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "word_vec = TfidfVectorizer(analyzer='word',\n",
    "            ngram_range=(1,2),#(1,3)\n",
    "            min_df=3,  # 4  5\n",
    "            max_df=0.9, # 0.95 1.0 \n",
    "            use_idf=True,\n",
    "            smooth_idf=True, \n",
    "            sublinear_tf=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_term_doc = word_vec.fit_transform(train['sentence_cuted'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_term_doc = word_vec.transform(test['sentence_cuted'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<99913x196053 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 3559779 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_term_doc#sparse matrix 102277x2820641  np.array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<10000x196053 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 335787 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_term_doc  #102277x2820641 稀疏矩阵可以直接在逻辑回归里面调用"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 逻辑回归构建模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\tempbeforenewdisk\\anaconda3\\envs\\jupyter\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
      "\n",
      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
      "Please also refer to the documentation for alternative solver options:\n",
      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
      "  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=4, class_weight=None, dual=False, fit_intercept=True,\n",
       "                   intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
       "                   multi_class='auto', n_jobs=None, penalty='l2',\n",
       "                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n",
       "                   warm_start=False)"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "clf = LogisticRegression(C=4, dual=False) # c 1.0  2 3 4 5  200w 10w\n",
    "clf.fit(train_term_doc,outputs)#[0,18]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_prob = clf.predict_proba(test_term_doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.49294859, 0.17924776, 0.32780364])"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_prob[0]# 4 +1 =5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 结果提交"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_pred = np.argmax(test_prob,axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 1, ..., 1, 1, 1], dtype=int64)"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_sub = df_test[['微博id']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>微博id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4456068992182160</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4456424178427250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4456797466940200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4456791021108920</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4457086404997440</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9995</th>\n",
       "      <td>4464179518243680</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9996</th>\n",
       "      <td>4464274073923100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9997</th>\n",
       "      <td>4464289160945130</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9998</th>\n",
       "      <td>4465347950314820</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999</th>\n",
       "      <td>4465492650005690</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  微博id\n",
       "0     4456068992182160\n",
       "1     4456424178427250\n",
       "2     4456797466940200\n",
       "3     4456791021108920\n",
       "4     4457086404997440\n",
       "...                ...\n",
       "9995  4464179518243680\n",
       "9996  4464274073923100\n",
       "9997  4464289160945130\n",
       "9998  4465347950314820\n",
       "9999  4465492650005690\n",
       "\n",
       "[10000 rows x 1 columns]"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_sub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\tempbeforenewdisk\\anaconda3\\envs\\jupyter\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "df_sub['y'] = test_pred-1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>微博id</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4456068992182160</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4456424178427250</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4456797466940200</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4456791021108920</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4457086404997440</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9995</th>\n",
       "      <td>4464179518243680</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9996</th>\n",
       "      <td>4464274073923100</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9997</th>\n",
       "      <td>4464289160945130</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9998</th>\n",
       "      <td>4465347950314820</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999</th>\n",
       "      <td>4465492650005690</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  微博id  y\n",
       "0     4456068992182160 -1\n",
       "1     4456424178427250  0\n",
       "2     4456797466940200  0\n",
       "3     4456791021108920 -1\n",
       "4     4457086404997440 -1\n",
       "...                ... ..\n",
       "9995  4464179518243680  0\n",
       "9996  4464274073923100  0\n",
       "9997  4464289160945130  0\n",
       "9998  4465347950314820  0\n",
       "9999  4465492650005690  0\n",
       "\n",
       "[10000 rows x 2 columns]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_sub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_sub.columns=['id','y']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4456068992182160</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4456424178427250</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4456797466940200</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4456791021108920</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4457086404997440</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 id  y\n",
       "0  4456068992182160 -1\n",
       "1  4456424178427250  0\n",
       "2  4456797466940200  0\n",
       "3  4456791021108920 -1\n",
       "4  4457086404997440 -1"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_sub.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_sub.to_csv('test_sub.csv',index=False, encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "metadata": {
     "collapsed": false
    },
    "source": []
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
